# Install dependencies using pip in the docker container
# https://medium.com/@shivangisareen/for-anyone-using-jupyter-notebook-installing-packages-18a9468d0c1c
import sys
!{sys.executable} -m pip install plotly pandas dataclasses_json stylecloud jieba
!{sys.executable} -m pip install -U kaleido
# Done with installation, so the following code will be clean (without the dependencies output, and re-running all the time
Requirement already satisfied: plotly in /opt/conda/lib/python3.9/site-packages (5.7.0) Requirement already satisfied: pandas in /opt/conda/lib/python3.9/site-packages (1.4.2) Requirement already satisfied: dataclasses_json in /opt/conda/lib/python3.9/site-packages (0.5.7) Requirement already satisfied: stylecloud in /opt/conda/lib/python3.9/site-packages (0.5.2) Requirement already satisfied: jieba in /opt/conda/lib/python3.9/site-packages (0.42.1) Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly) (8.0.1) Requirement already satisfied: six in /opt/conda/lib/python3.9/site-packages (from plotly) (1.16.0) Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.9/site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.9/site-packages (from pandas) (2022.1) Requirement already satisfied: numpy>=1.18.5 in /opt/conda/lib/python3.9/site-packages (from pandas) (1.21.6) Requirement already satisfied: marshmallow-enum<2.0.0,>=1.5.1 in /opt/conda/lib/python3.9/site-packages (from dataclasses_json) (1.5.1) Requirement already satisfied: marshmallow<4.0.0,>=3.3.0 in /opt/conda/lib/python3.9/site-packages (from dataclasses_json) (3.15.0) Requirement already satisfied: typing-inspect>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from dataclasses_json) (0.7.1) Requirement already satisfied: icon-font-to-png in /opt/conda/lib/python3.9/site-packages (from stylecloud) (0.4.1) Requirement already satisfied: wordcloud in /opt/conda/lib/python3.9/site-packages (from stylecloud) (1.8.1) Requirement already satisfied: matplotlib in /opt/conda/lib/python3.9/site-packages (from stylecloud) (3.5.1) Requirement already satisfied: fire in /opt/conda/lib/python3.9/site-packages (from stylecloud) (0.4.0) Requirement already satisfied: palettable in /opt/conda/lib/python3.9/site-packages (from stylecloud) (3.3.0) Requirement already satisfied: packaging in /opt/conda/lib/python3.9/site-packages (from marshmallow<4.0.0,>=3.3.0->dataclasses_json) (21.3) Requirement already satisfied: mypy-extensions>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from typing-inspect>=0.4.0->dataclasses_json) (0.4.3) Requirement already satisfied: typing-extensions>=3.7.4 in /opt/conda/lib/python3.9/site-packages (from typing-inspect>=0.4.0->dataclasses_json) (4.2.0) Requirement already satisfied: termcolor in /opt/conda/lib/python3.9/site-packages (from fire->stylecloud) (1.1.0) Requirement already satisfied: Pillow>=4.0.0 in /opt/conda/lib/python3.9/site-packages (from icon-font-to-png->stylecloud) (9.1.0) Requirement already satisfied: requests>=2.12.5 in /opt/conda/lib/python3.9/site-packages (from icon-font-to-png->stylecloud) (2.27.1) Requirement already satisfied: tinycss>=0.4 in /opt/conda/lib/python3.9/site-packages (from icon-font-to-png->stylecloud) (0.4) Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib->stylecloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib->stylecloud) (4.33.2) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib->stylecloud) (1.4.2) Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib->stylecloud) (3.0.8) Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.9/site-packages (from requests>=2.12.5->icon-font-to-png->stylecloud) (2.0.12) Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests>=2.12.5->icon-font-to-png->stylecloud) (2021.10.8) Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests>=2.12.5->icon-font-to-png->stylecloud) (3.3) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests>=2.12.5->icon-font-to-png->stylecloud) (1.26.9) Requirement already satisfied: kaleido in /opt/conda/lib/python3.9/site-packages (0.2.1)
The dataset is collected using Spotify API and Genius API.
For a singer, we fetch all his/her albums, and subsequently fetch all the tracks within the album. For each track that we fetched, we also fetch the audio features and the lyrics. In the end, combining all the data into a json file.
from IPython.display import Image
# To run after editing, use shift+enter
##############################################################################################################
# import data structure
# coding
from cmath import sin
import customdatatypes
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# gosh this comment saves lives
import plotly.io as pio
pio.renderers.default = "iframe" # https://github.com/plotly/plotly.py/issues/2508#issuecomment-907338746
import os
import stylecloud
import jieba
# Plot
def show_image(path: str):
display(Image(filename=f"{path}"))
# Load data
english_singer_names = [
"Taylor Swift",
"Justin Bieber",
"Sia",
"Miley Cyrus",
]
chinese_singer_names = [
"Jay Chou",
"JJ Lin",
]
english_singers = []
for singer_name in english_singer_names:
with open(f'data/{singer_name}.json', 'r') as infile:
singer = customdatatypes.Singer.from_json(infile.read())
english_singers.append(singer)
chinese_singers = []
for singer_name in chinese_singer_names:
with open(f'data/{singer_name}.json', 'r') as infile:
singer = customdatatypes.Singer.from_json(infile.read())
chinese_singers.append(singer)
Currently we collected data from 6 singers, 4 English speaking and 2 Chinese speaking.
Because Spotify doesn't directly provide lyrics, so we have to use Genius to obtain the lyrics. For live concert tracks, often, the lyrics are missing.
# data set basic data
def draw_album_info(singers: list[customdatatypes.Singer]): # bar chart
singer_names = []
album_count = []
for singer in singers:
singer_names.append(singer.name)
album_count.append(len(singer.albums))
dataframe = pd.DataFrame(dict(singer=singer_names, albums=album_count))
fig = px.bar(dataframe, x=dataframe.singer, y=dataframe.albums, title="Albums (album + singles + live concerts) per singer")
# fig.show()
os.system(f"mkdir -p out/basics")
fig.write_image(f"out/basics/singers.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/basics/singers.png")
show_image(f"out/basics/singers.png")
def draw_track_and_lyrics_availbility(singers: list[customdatatypes.Singer]): # bar chart
singer_names = []
with_lyrics = []
without_lyrics = []
for singer in singers:
singer_names.append(singer.name)
tracks_with_lyrics = 0
tracks_without_lyrics = 0
for album in singer.albums:
for track in album.tracks:
if len(track.lyrics) == 0:
tracks_without_lyrics += 1
else:
tracks_with_lyrics += 1
with_lyrics.append(tracks_with_lyrics)
without_lyrics.append(tracks_without_lyrics)
dataframe = pd.DataFrame({'Singer': singer_names,
'has_lyrics': with_lyrics,
'no_lyrics': without_lyrics})
fig = px.bar(dataframe, x=dataframe.Singer, y=[dataframe.has_lyrics, dataframe.no_lyrics], title="Tracks per singer")
fig.update_yaxes(title="Total tracks")
# fig.show()
os.system(f"mkdir -p out/basics")
fig.write_image(f"out/basics/tracks.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/basics/tracks.png")
show_image(f"out/basics/tracks.png")
draw_album_info(singers=english_singers + chinese_singers)
draw_track_and_lyrics_availbility(singers=english_singers + chinese_singers)
The following graphs represents all tracks performed by a particular singer.
By looking at the radar graph, we can see the singer's signitures, as one singer might be more energetic than another, etc.
# audio features
def draw_audio_feature_of_all_songs_from_a_singer(singer: customdatatypes.Singer):
print(f"draw_audio_feature_of_all_songs_from_a_singer {singer.name}")
categories = [
'acousticness',
'danceability',
'energy',
'instrumentalness',
'liveness',
'speechiness',
'valence'
]
fig = go.Figure()
for album in singer.albums:
for track in album.tracks:
fig.add_trace(go.Scatterpolar(
r=[
track.acousticness,
track.danceability,
track.energy,
track.instrumentalness,
track.liveness,
track.speechiness,
track.valence
],
theta=categories,
fill='toself',
name=f"{track.name}"
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 1]
)),
showlegend=False
)
fig.update_layout(title_text=f"Audio features of all {singer.name} tracks")
# fig.show()
os.system(f"mkdir -p out/radar")
fig.write_image(f"out/radar/{singer.name}.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/radar/{singer.name}.png")
show_image(f"out/radar/{singer.name}.png")
def draw_audio_feature_of_all_albums_from_a_singer(singer: customdatatypes.Singer):
print(f"draw_audio_feature_of_all_albums_from_a_singer {singer.name}")
categories = [
'acousticness',
'danceability',
'energy',
'instrumentalness',
'liveness',
'speechiness',
'valence'
]
fig = go.Figure()
for album in singer.albums:
for track in album.tracks:
fig.add_trace(go.Scatterpolar(
r=[
track.acousticness,
track.danceability,
track.energy,
track.instrumentalness,
track.liveness,
track.speechiness,
track.valence
],
theta=categories,
fill='toself',
name=f"{track.name}"
))
print(f"draw_audio_feature_of_all_albums_from_a_singer {singer.name} {album.name}")
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 1]
)),
showlegend=False
)
fig.update_layout(title_text=f"{singer.name} - {album.name}")
# fig.show()
os.system(f"mkdir -p \"out/{singer.name}\"")
fig.write_image(f"out/{singer.name}/{album.name}.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/{singer.name}/{album.name}.svg")
show_image(f"out/{singer.name}/{album.name}.svg")
for singer in english_singers:
draw_audio_feature_of_all_songs_from_a_singer(singer=singer)
# draw_audio_feature_of_all_albums_from_a_singer(singer=singer)
for singer in chinese_singers:
draw_audio_feature_of_all_songs_from_a_singer(singer=singer)
draw_audio_feature_of_all_songs_from_a_singer Taylor Swift
draw_audio_feature_of_all_songs_from_a_singer Justin Bieber
draw_audio_feature_of_all_songs_from_a_singer Sia
draw_audio_feature_of_all_songs_from_a_singer Miley Cyrus
draw_audio_feature_of_all_songs_from_a_singer Jay Chou
draw_audio_feature_of_all_songs_from_a_singer JJ Lin
For all 6 singers, we would like to know what are their most used words, so we can perform more further analysis.
Sidenotes: for Chinese singers, we segmented the sentences beforehand, using Jieba.
# word cloud
def draw_word_cloud_for_english_singer(singer: customdatatypes.Singer):
print(f"draw_word_cloud_for_english_singer {singer.name}")
text = ""
for album in singer.albums:
for track in album.tracks:
if len(track.lyrics) > 0:
text += " " + track.lyrics.replace("oh", " ").replace("I", " ").replace("in", " ").replace("the", " ")
os.system(f"mkdir -p \"out/word_cloud\"")
name_split = singer.name.split(" ")
english_stop_words = ['verse', 'lyrics', 'chorus', 'pre']
english_stop_words += name_split
stylecloud.gen_stylecloud(text=text,
icon_name='fas fa-horse',
palette='colorbrewer.diverging.Spectral_11',
custom_stopwords=english_stop_words,
background_color='black',
gradient='horizontal',
output_name=f"out/word_cloud/{singer.name}.png")
show_image(path=f"out/word_cloud/{singer.name}.png")
def draw_word_cloud_for_chinese_singer(singer: customdatatypes.Singer):
# https://blog.csdn.net/qq_42374697/article/details/106269448
print(f"draw_word_cloud_for_chinese_singer {singer.name}")
text = ""
for album in singer.albums:
for track in album.tracks:
if len(track.lyrics) > 0:
# print(" ".join(jieba.cut(track.lyrics)))
text += " " + " ".join(jieba.cut(track.lyrics))
os.system(f"mkdir -p \"out/word_cloud\"")
name_split = singer.name.split(" ")
english_stop_words = ['verse', 'lyrics', 'chorus', 'pre']
english_stop_words += name_split
stylecloud.gen_stylecloud(text=text,
icon_name='fas fa-horse',
palette='colorbrewer.diverging.Spectral_11',
font_path='NotoSansCJKsc-VF.otf', # https://github.com/googlefonts/noto-cjk/tree/main/Sans
stopwords=True,
custom_stopwords=english_stop_words,
add_stopwords=True,
background_color='black',
gradient='horizontal',
output_name=f"out/word_cloud/{singer.name}.png")
show_image(path=f"out/word_cloud/{singer.name}.png")
for singer in english_singers:
draw_word_cloud_for_english_singer(singer=singer)
for singer in chinese_singers:
draw_word_cloud_for_chinese_singer(singer=singer)
draw_word_cloud_for_english_singer Taylor Swift
draw_word_cloud_for_english_singer Justin Bieber
draw_word_cloud_for_english_singer Sia
draw_word_cloud_for_english_singer Miley Cyrus
Building prefix dict from the default dictionary ... Loading model from cache /tmp/jieba.cache
draw_word_cloud_for_chinese_singer Jay Chou
Loading model cost 0.548 seconds. Prefix dict has been built successfully.
draw_word_cloud_for_chinese_singer JJ Lin
For all the singers, we would like to know how does the audio feature changes over time.
For each graph, the x-axis is the sorted album release time, and the y-axis is a particular audio feature of all the tracks within a given album. By looking at the graph from left to right, we can see how the singer evolves over time.
# audio feature changes over time for a singer
def draw_audio_feature_change_over_time(singer: customdatatypes.Singer):
print(f"draw_audio_feature_change_over_time {singer.name}")
fig = make_subplots(rows=1, cols=5, subplot_titles=('acousticness', 'energy', 'liveness', 'loudness', 'tempo'))
x = []
y = []
for (i, album) in enumerate(singer.albums):
for track in album.tracks:
x.append(album.release_date)
y.append(track.acousticness)
fig.add_trace(go.Scatter(x=x, y=y, mode="markers+text", showlegend=False), row=1, col=1)
fig.update_xaxes(title="Albums release date", row=1, col=1, rangemode="tozero")
fig.update_yaxes(title="Value", row=1, col=1, rangemode="tozero")
x = []
y = []
for (i, album) in enumerate(singer.albums):
for track in album.tracks:
x.append(album.release_date)
y.append(track.energy)
fig.add_trace(go.Scatter(x=x, y=y, mode="markers+text", showlegend=False), row=1, col=2)
fig.update_xaxes(title="Albums release date", row=1, col=2, rangemode="tozero")
fig.update_yaxes(title="Value", row=1, col=2, rangemode="tozero")
x = []
y = []
for (i, album) in enumerate(singer.albums):
for track in album.tracks:
x.append(album.release_date)
y.append(track.liveness)
fig.add_trace(go.Scatter(x=x, y=y, mode="markers+text", showlegend=False), row=1, col=3)
fig.update_xaxes(title="Albums release date", row=1, col=3, rangemode="tozero")
fig.update_yaxes(title="Value", row=1, col=3, rangemode="tozero")
x = []
y = []
for (i, album) in enumerate(singer.albums):
for track in album.tracks:
x.append(album.release_date)
y.append(track.loudness)
fig.add_trace(go.Scatter(x=x, y=y, mode="markers+text", showlegend=False), row=1, col=4)
fig.update_xaxes(title="Albums release date", row=1, col=4, rangemode="tozero")
fig.update_yaxes(title="Value", row=1, col=4, rangemode="tozero")
x = []
y = []
for (i, album) in enumerate(singer.albums):
for track in album.tracks:
x.append(album.release_date)
y.append(track.tempo)
fig.add_trace(go.Scatter(x=x, y=y, mode="markers+text", showlegend=False), row=1, col=5)
fig.update_xaxes(title="Albums release date", row=1, col=5, rangemode="tozero")
fig.update_yaxes(title="Value", row=1, col=5, rangemode="tozero")
fig.update_layout(height=400, width=1400, title_text=f"{singer.name}'s audio feature change over time")
# fig.show() # will have weird bug during rendering
os.system(f"mkdir -p \"out/audio_feature\"")
fig.write_image(f"out/audio_feature/{singer.name}.png")
show_image(f"out/audio_feature/{singer.name}.png")
for singer in english_singers+chinese_singers:
draw_audio_feature_change_over_time(singer=singer)
draw_audio_feature_change_over_time Taylor Swift
draw_audio_feature_change_over_time Justin Bieber
draw_audio_feature_change_over_time Sia
draw_audio_feature_change_over_time Miley Cyrus
draw_audio_feature_change_over_time Jay Chou
draw_audio_feature_change_over_time JJ Lin
We use QQ music to obtain the trendy musics on both Billboards and in China.
Notice that in 2022, only 14 weeks of data is available so far.
import json
import re
def load_qq_data(id: int) -> {}:
base_folder=f'data/qq'
folder=f"{base_folder}/{id}"
ret = {}
for y in range(2018, 2022 + 1):
ret[y] = {}
for w in range(1, 52 + 1):
if y == 2022 and w > 14:
break
try:
with open(f'{folder}/{y}_{w}.json', encoding='utf-8') as infile:
data = json.load(infile)
ret[y][w] = {}
for (i, track) in enumerate(data['data']['list']):
ret[y][w][i] = {}
ret[y][w][i]['title'] = track['title']
ret[y][w][i]['singerName'] = track['singerName']
ret[y][w][i]['mid'] = track['mid']
try:
mid = track['mid']
title = track['title']
title = title.replace("/", " ")
with open(f'{base_folder}/lyrics/{mid}_{title}.json', encoding='utf-8') as lyrics_file:
data = json.load(lyrics_file)
str_without_brackets = re.sub("[\(\[].*?[\)\]]", "", data['data']['lyric']) # TODO: remove more terms from the first couple of lines
ret[y][w][i]['lyrics'] = str_without_brackets
except Exception as e:
print(e, f'{base_folder}/lyrics/{mid}_{title}.json')
pass
except Exception as e:
print(e, f'{folder}/{y}_{w}.json')
pass
return ret
print("Start loading trend_data_Chinese")
trend_data_Chinese = load_qq_data(id=26)
print("Done loading trend_data_Chinese")
print("Start loading trend_data_billboard")
trend_data_billboard = load_qq_data(id=108)
print("Done loading trend_data_billboard")
Start loading trend_data_Chinese [Errno 2] No such file or directory: 'data/qq/26/2018_52.json' data/qq/26/2018_52.json [Errno 2] No such file or directory: 'data/qq/26/2019_52.json' data/qq/26/2019_52.json Done loading trend_data_Chinese Start loading trend_data_billboard [Errno 2] No such file or directory: 'data/qq/108/2018_1.json' data/qq/108/2018_1.json Done loading trend_data_billboard
The following graph is the number of unique songs that has been recorded on the top 100 weekly music board
def get_unique_songs_per_year(data: {}, title: str): # bar chart
year = []
count = []
for y in data:
year.append(y)
tmp = {}
for w in data[y]:
for t in data[y][w]:
tmp[data[y][w][t]['title']] = True
count.append(len(tmp))
# print(count)
fig = px.bar(x=year, y=count, title=f"{title}")
fig.update_xaxes(title="Year")
fig.update_yaxes(title="Unique tracks")
# fig.show()
os.system(f"mkdir -p out/qq")
fig.write_image(f"out/qq/get_unique_songs_per_year.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/qq/get_unique_songs_per_year.png")
show_image(f"out/qq/get_unique_songs_per_year.png")
get_unique_songs_per_year(data=trend_data_Chinese, title="Unique trending songs on the board per year (China)")
get_unique_songs_per_year(data=trend_data_billboard, title="Unique trending songs on the board per year (Billboard)")
def popular_singers_by_unique_songs_on_board_per_year(data: {}): # a) bar chart (x: songs on board (max ~50) b) y: singer count)
for y in data:
singers = {}
for w in data[y]:
for t in data[y][w]:
singer_split = data[y][w][t]['singerName'].split("/")
for s in singer_split:
if s in singers:
pass
else:
singers[s] = {}
singers[s][data[y][w][t]['title']] = True
# print(y, len(singers))
count = {}
for s in singers:
# print(s, len(singers[s]))
if len(singers[s]) in count:
pass
else:
count[len(singers[s])] = 0
count[len(singers[s])] += 1
mx = 0
for c in count:
# print(y, c, count[c])
mx = max(mx, c)
print("mx", y, mx)
popular_singers_by_unique_songs_on_board_per_year(data=trend_data_Chinese)
popular_singers_by_unique_songs_on_board_per_year(data=trend_data_billboard)
mx 2018 22 mx 2019 39 mx 2020 42 mx 2021 43 mx 2022 42 mx 2018 39 mx 2019 22 mx 2020 47 mx 2021 41 mx 2022 20
The following graph shows how many unique songs does each singer has on the top 100 board since 2018
For example, we can see that there are 656 singers that managed to have 1 song from their discography being listed on the Chinese top 100 music board, since 2018.
def popular_singers_by_unique_songs_on_board_all(data: {}, title: str): # a) bar chart (x: songs on board (max ~110), y: singer count) b) table
singers = {}
for y in data:
for w in data[y]:
for t in data[y][w]:
singer_split = data[y][w][t]['singerName'].split("/")
for s in singer_split:
if s in singers:
pass
else:
singers[s] = {}
singers[s][data[y][w][t]['title']] = True
# print(len(singers))
count = {}
for s in singers:
# print(s, len(singers[s]))
if len(singers[s]) in count:
pass
else:
count[len(singers[s])] = 0
count[len(singers[s])] += 1
mx = 0
for c in count:
# print(c, count[c])
mx = max(mx, c)
# print("mx", mx)
x = []
y = []
for i in range(1, mx + 1):
x.append(i)
if i in count:
y.append(count[i])
else:
y.append(0)
fig = px.bar(x=x, y=y, title=f"{title}", text_auto='3s')
fig.update_xaxes(title="Unique songs")
fig.update_yaxes(title="Number of singers")
fig.update_layout(height=400, width=1400)
# fig.show()
os.system(f"mkdir -p out/qq")
fig.write_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.png")
show_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.png")
popular_singers_by_unique_songs_on_board_all(data=trend_data_Chinese, title="Unique songs per singer on board since 2018 (Chinese)")
popular_singers_by_unique_songs_on_board_all(data=trend_data_billboard, title="Unique songs per singer on board since 2018 (Billboard)")
def time_song_on_board_per_year(data: {}, title: str): # bar chart (x: on board week count, y: song count) b) table
tracks = {}
for y in data:
for w in data[y]:
for t in data[y][w]:
if data[y][w][t]['title'] in tracks:
pass
else:
tracks[data[y][w][t]['title']] = 0
tracks[data[y][w][t]['title']] += 1
# print(len(tracks))
count = {}
for t in tracks:
# print(s, len(singers[s]))
if tracks[t] in count:
pass
else:
count[tracks[t]] = 0
count[tracks[t]] += 1
mx = 0
for c in count:
# print(c, count[c])
mx = max(mx, c)
# print("mx", mx)
x = []
y = []
for i in range(1, mx + 1):
x.append(i)
if i in count:
y.append(count[i])
else:
y.append(0)
fig = px.bar(x=x, y=y, title=f"{title}", text_auto='10s')
fig.update_xaxes(title="Number of weeks a track stayed on the board")
fig.update_yaxes(title="Number of unique songs")
fig.update_layout(height=400, width=1400)
# fig.show()
os.system(f"mkdir -p out/qq")
fig.write_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.svg")
# for jupyter, use png, due to performance reason
fig.write_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.png")
show_image(f"out/qq/popular_singers_by_unique_songs_on_board_all.png")
time_song_on_board_per_year(data=trend_data_Chinese, title="Duration of weeks a track is on the board (Chinese)")
time_song_on_board_per_year(data=trend_data_billboard, title="Duration of weeks a track is on the board (Billboard)")
The following garph shows the word cloud using the lyrics from the songs on the board from 2018, 2019, 2020, 2021, 2022
def english_word_cloud_per_year(data: {}):
for y in data:
text = ""
for w in data[y]:
for t in data[y][w]:
if len(data[y][w][t]['lyrics']) > 0:
text += " " + data[y][w][t]['lyrics'].replace("'", " ")
os.system(f"mkdir -p \"out/word_cloud\"")
stylecloud.gen_stylecloud(text=text,
icon_name='fas fa-horse',
background_color='black',
gradient='horizontal',
output_name=f"out/word_cloud/english_{y}.png")
print(f"year {y}")
show_image(path=f"out/word_cloud/english_{y}.png")
# def chinese_word_cloud_per_year(data: {}):
# text += " " + " ".join(jieba.cut(track.lyrics))
# os.system(f"mkdir -p \"out/word_cloud\"")
# name_split = singer.name.split(" ")
# english_stop_words = ['verse', 'lyrics', 'chorus', 'pre']
# english_stop_words += name_split
# stylecloud.gen_stylecloud(text=text,
# icon_name='fas fa-horse',
# palette='colorbrewer.diverging.Spectral_11',
# font_path='NotoSansCJKsc-VF.otf', # https://github.com/googlefonts/noto-cjk/tree/main/Sans
# stopwords=True,
# custom_stopwords=english_stop_words,
# add_stopwords=True,
# background_color='black',
# gradient='horizontal',
# output_name=f"out/word_cloud/{singer.name}.png")
# show_image(path=f"out/word_cloud/{singer.name}.png")
english_word_cloud_per_year(data=trend_data_billboard)
year 2018
year 2019
year 2020
year 2021
year 2022
def graph_billboard_total_distinct_singers(data: {}):
class tmp:
def __init__(self, singer_name: str, track_count: int):
self.singer_name = singer_name
self.track_count = track_count
converted = []
for key, value in data.items():
converted.append(tmp(singer_name=key, track_count=value))
import operator
converted.sort(key=operator.attrgetter('track_count'), reverse=True)
singer_names = []
track_counts = []
for tup in converted:
singer_names.append(tup.singer_name)
track_counts.append(tup.track_count)
fig = go.Figure(data=[go.Table(header=dict(values=['Singer name', 'Track count']),
cells=dict(values=[singer_names, track_counts]))])
fig.show()
# os.system(f"mkdir -p out/cache")
# fig.write_image(f"out/cache/get_billboard_total_distinct_singers.png")
# show_image(f"out/cache/get_billboard_total_distinct_singers.png")
def get_billboard_total_distinct_singers(data: {}, cutoff: int):
# all, top 50, top 20
tmp = {}
for y in data:
for w in data[y]:
for t in data[y][w]:
singers = str(data[y][w][t]['singerName'])
singerList = singers.split("/")
# print(singerList)
for singer in singerList:
if t <= cutoff:
if singer in tmp:
tmp[singer] += 1
else:
tmp[singer] = 1
print(f"Distinct singers (top {cutoff} of all weeks) = {len(tmp)}")
graph_billboard_total_distinct_singers(tmp)
get_billboard_total_distinct_singers(data=trend_data_billboard, cutoff=100)
Distinct singers (top 100 of all weeks) = 906
get_billboard_total_distinct_singers(data=trend_data_billboard, cutoff=50)
Distinct singers (top 50 of all weeks) = 537
get_billboard_total_distinct_singers(data=trend_data_billboard, cutoff=30)
Distinct singers (top 30 of all weeks) = 354
get_billboard_total_distinct_singers(data=trend_data_billboard, cutoff=20)
Distinct singers (top 20 of all weeks) = 262
def graph_billboard_distinct_singers_per_week(billboard: {}):
pass
def check_billboard_missing_singers_on_spotify():
pass
def check_billboard_missing_songs_on_spotify():
pass